packages <- c("CIMseq", "CIMseq.data", "tidyverse", "scSeqTools", "printr")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
  case_when(
    class == "0" ~ "C.Stem.proximal",
    class == "1" ~ "C.Stem.distal",
    class == "2" ~ "SI.Stem",
    class == "3" ~ "C.Goblet.distal",
    class == "4" ~ "SI.Enterocytes",
    class == "5" ~ "SI.Goblet",
    class == "6" ~ "C.Goblet.proximal",
    class == "7" ~ "Enteroendocrine",
    class == "8" ~ "Tufft",
    class == "9" ~ "SI.Paneth",
    class == "10" ~ "Blood",
    TRUE ~ "error"
  )
}

cOrder <- c(
  "C.Stem.proximal", "C.Goblet.proximal", "C.Stem.distal", "C.Goblet.distal",
  "SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Enterocytes",
  "Enteroendocrine", "Tufft", "Blood"
)

getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
features <- getData(cObjMul, "features")
names(features) <- renameClasses(names(features))
getData(cObjMul, "features") <- features
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions

Classification

plotUnsupervisedClass(cObjSng, cObjMul)

plotUnsupervisedMarkers(
  cObjSng, cObjMul,
  c("Lgr5", "Ptprc", "Chga", "Dclk1", "Alpi", "Slc26a3", "Atoh1", "Lyz1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Hoxb13",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Mki67",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

classHeatmap(
  data = data.frame(
    gene = rownames(getData(cObjMul, "counts"))[getData(cObjMul, "features")],
    class = names(getData(cObjMul, "features"))
  ),
  counts.log = getData(cObjSng, "counts.log"),
  classes = tibble(
    sample = colnames(getData(cObjSng, "counts")), 
    class = getData(cObjSng, "classification")
  )
)

Show top 10 (by fold change) features per class.

violinPlot <- function(cpm, genes, classes, class) {
  cpm[genes, ] %>%
    t() %>% 
    matrix_to_tibble("sample") %>%
    gather(gene, cpm, -sample) %>%
    full_join(tibble(sample = colnames(cpm), class = classes), by = "sample") %>%
    ggplot() +
    geom_jitter(aes(class, cpm), height = 0, size = 0.1, alpha = 0.25) +
    facet_wrap(~gene, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(title = paste0(class, " genes"))
}


getGenes <- function(
  cpm, classes, features, class, ntop = 10
) {
  fc <- foldChangePerClass(
    cpm[features[names(features) == class], ], 
    tibble(sample = colnames(cpm), class = classes)
  )
  rownames(fc[order(fc[, class], decreasing = TRUE), ])[1:ntop]
}

cpm <- getData(cObjSng, "counts.cpm")
c <- "C.Stem.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Stem.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Stem"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Goblet"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Enterocytes"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Paneth"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

Deconvolution

plotSwarmCircos(sObj, cObjSng, cObjMul, weightCut = 0, classOrder = cOrder)

Filtering

Only detected duplicates and triplicates.
Only ERCC estimated cell number <= 4.
Weight cutoff = 5.

adj <- adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3

plotSwarmCircos(
  filterSwarm(sObj, keep), cObjSng, cObjMul, weightCut = 5, 
  classOrder = cOrder, theoretical.max = 4
)

False positive stats

efm <- getEdgesForMultiplet(sObj, cObjSng, cObjMul, theoretical.max = 4) %>% 
  mutate(conn = map2(from, to, ~tibble(
    from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
  ))) %>% 
  select(-from, -to) %>% 
  unnest() %>% 
  distinct()

fp <- efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  group_by(sample) %>%
  summarize(fp = sum(fp)) %>%
  {setNames(pull(., fp), pull(., sample))}

1218 false positive connections were detected out of 2119 total connections. The per multiplet distribution is shown in the plot below.

ggplot() +
  geom_bar(aes(fp)) +
  theme_bw() +
  labs(x = "False positives per multiplet") +
  scale_x_continuous(breaks = 0:max(fp))

The type of false positive connections and their amount are shown below.

typeFreq <- efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()

typeFreq
from to n
C.Stem.distal C.Stem.proximal 312
C.Goblet.distal C.Stem.proximal 218
C.Stem.proximal SI.Stem 159
C.Goblet.distal C.Goblet.proximal 113
C.Goblet.proximal C.Stem.distal 86
C.Stem.proximal SI.Goblet 84
C.Stem.distal SI.Enterocytes 51
C.Stem.distal SI.Goblet 44
C.Goblet.distal SI.Goblet 39
C.Stem.distal SI.Stem 28
C.Stem.proximal SI.Enterocytes 27
C.Goblet.proximal SI.Goblet 24
C.Goblet.distal SI.Stem 12
C.Stem.proximal SI.Paneth 10
C.Goblet.distal SI.Enterocytes 4
C.Goblet.proximal SI.Stem 4
C.Goblet.proximal SI.Enterocytes 2
C.Stem.distal SI.Paneth 1
#number of features corresponding to class
#total sum of counts for features corresponding to class
# features <- getData(cObjMul, "features")
# cpm <- getData(cObjSng, "counts.cpm")
# classifications <- getData(cObjSng, "classification")
# rs <- rowSums(cpm)
# table(names(features))
# 
# geneDat <- tibble(class = names(features), gene = rownames(getData(cObjSng, "counts"))[features]) %>%
#   mutate(all = rs[gene]) %>%
#   mutate(classOnly = map2_dbl(class, gene, function(c, g) {
#     sum(cpm[g, classifications == c])
#   })) %>% 
#   group_by(class) %>%
#   summarize(sum.all = sum(all), sum.classOnly = sum(classOnly)) 
# 
# inner_join(typeFreq, geneDat, by = c("from" = "class")) %>%
#   inner_join(geneDat, by = c("to" = "class")) %>%
#   mutate(
#     sum.all = map2_dbl(sum.all.x, sum.all.y, ~sum(c(.x, .y))),
#     sum.classOnly = map2_dbl(sum.classOnly.x, sum.classOnly.y, ~sum(c(.x, .y)))
#   ) %>%
#   ggplot() +
# geom_point(aes(sum.all, n))
dc <- rowSums(adjustFractions(cObjSng, cObjMul, sObj))
tibble(
  sample = names(fp),
  `False positives (n)` = fp,
  `Deconvolution detected connections (n)` = dc[names(fp)]
) %>% 
  group_by(`Deconvolution detected connections (n)`) %>%
  summarize(`False positives (n)` = sum(`False positives (n)`)) %>%
  as.data.frame()
Deconvolution detected connections (n) False positives (n)
2 399
3 528
4 230
5 51
6 10

Correlation between various parameters and the number of false positives is shown below:

mulNames <- names(fp) #note other multiplets have 0 connections

ec <- estimateCells(cObjSng, cObjMul, warn = FALSE) %>%
  filter(sample %in% mulNames) %>%
  {setNames(pull(., estimatedCellNumber), pull(., sample))}
ec <- ec[mulNames]
dec <- rowSums(adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE))
dec <- dec[mulNames]
c <- getData(sObj, "costs")
c <- c[mulNames]
tc <- colSums(getData(cObjMul, "counts"))
tc <- tc[mulNames]
dg <- apply(getData(cObjMul, "counts"), 2, function(c) length(which(c != 0)))
dg <- dg[mulNames]
feat.do <- getData(cObjMul, "counts.cpm")[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  group_by(sample) %>%
  summarize(do = length(which(cpm == 0))) %>%
  {setNames(pull(., do), pull(., sample))}
feat.do <- feat.do[mulNames]

cormat <- cor(matrix(c(fp, ec, dec, c, tc, dg, feat.do), ncol = 7))
corvec <- cormat[2:nrow(cormat), 1]
names(corvec) <- c(
  "ERCC estimated cell number", "Deconvolution estimated cell number", 
  "Deconvolution cost", "Total counts", "Total detected genes", 
  "Drop-outs in features"
)
data.frame(cor = corvec)
cor
ERCC estimated cell number 0.1920239
Deconvolution estimated cell number 0.7357167
Deconvolution cost 0.3197969
Total counts 0.3305745
Total detected genes 0.4754982
Drop-outs in features -0.4329068
cpm <- getData(cObjMul, "counts.cpm")
features <- getData(cObjMul, "features")
cpm[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  inner_join(tibble(class = names(features), gene = rownames(cpm)[features])) %>%
  group_by(sample, class) %>%
  summarize(cpm.sum = sum(cpm)) %>%
  ungroup() %>%
  inner_join(tibble(sample = names(fp), fp = fp)) %>%
  ggplot() +
  geom_point(aes(cpm.sum, fp)) +
  facet_wrap(~class)
#multiplets with high fp look to have low expression of the selected features

Distribution of fractions in connections thought to be true vs connections known to be false.

fractions <- getData(sObj, "fractions")
efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  mutate(fracs = pmap(list(sample, from, to), function(s, f, t) {
    fractions[s, c(f, t)]
  })) %>%
  unnest() %>%
  ggplot() +
  geom_boxplot(aes(fp, fracs)) +
  labs(x = "False positive", y = "Deconvolution fraction") +
  theme_bw()

Cluster multiplets and look for an overrepresentaion of false positives.

p.dist <- CIMseq::pearsonsDist(getData(cObjMul, "counts.cpm"), CIMseq::selectTopMax(getData(cObjMul, "counts.cpm"), 2000))
set.seed(234899)
u <- uwot::umap(p.dist)
rownames(u) <- colnames(getData(cObjMul, "counts"))
u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  mutate(fp.bool = fp > 0) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp.bool)) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"

u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp)) +
  theme_bw() +
  scale_colour_viridis_c() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"